package xiaoa.java.spider;

import java.io.File;



import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CountDownLatch;

import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSON;

import xiaoa.java.utils.HttpUtils;


/**
 * Crawls Baidu search result pages for one fixed query, extracts the result
 * metadata stored in each result's {@code data-tools} attribute, resolves the
 * Baidu redirect links to their {@code Location} targets and writes everything
 * to a JSON file.
 *
 * @author xiaoa
 */
public class BaiduSearchSpider {

	/**
	 * Downloads one result page of the fixed search query.
	 *
	 * <p>The query string is hard-coded; its {@code wd} parameter is the
	 * URL-encoded keyword "政府留言板". Only the {@code pn} parameter varies.
	 *
	 * @param offset page index; it is multiplied by 10 to build the {@code pn}
	 *               parameter (presumably 10 results per page - confirm)
	 * @return the response body as returned by {@code HttpUtils.doGet(...).asString()}
	 * @throws Throwable whatever the underlying HTTP helper throws
	 * @author xiaoa
	 */
	public static String fetch(int offset) throws Throwable {

		String html = HttpUtils.doGet("https://www.baidu.com/s?wd=%E6%94%BF%E5%BA%9C%E7%95%99%E8%A8%80%E6%9D%BF&pn=" + offset * 10 + "&oq=%E6%94%BF%E5%BA%9C%E7%95%99%E8%A8%80%E6%9D%BF&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=93ab2d7b00015422&rsv_t=5759rT2stFS5HymsjQqNr8Ot8UwmG3AQumj9FdqY26JpeBs%2Blep4SjQ9bzCnSs2%2Fz8M1&gpc=stf%3D1475251200%2C1487865600%7Cstftype%3D2&tfflag=1").asString();

		return html;
	}

	/**
	 * Parses a result page and collects one {@link SreachVo} per element that
	 * matches the {@code .c-tools} selector and carries a non-empty
	 * {@code data-tools} attribute.
	 *
	 * @param body HTML of a result page
	 * @return the parsed entries in document order (possibly empty), or
	 *         {@code null} when {@code body} is {@code null} or empty
	 * @throws Throwable if the HTML or the embedded JSON cannot be parsed
	 * @author xiaoa
	 */
	public static List<SreachVo> parse(String body) throws Throwable {

		if (body == null || body.equals("")) {
			System.out.println("===================== body is null");
			// Kept as null (not an empty list) so existing callers that test
			// for null keep working.
			return null;
		}

		// Parse the HTML.
		Document doc = Jsoup.parse(body);

		// Select the elements that hold the per-result JSON.
		Elements elements = doc.select(".c-tools");

		List<SreachVo> list = new LinkedList<SreachVo>();

		for (Element e : elements) {

			String jsonStr = e.attr("data-tools");

			if (jsonStr == null || jsonStr.equals("")) {
				continue;
			}

			SreachVo vo = JSON.parseObject(jsonStr, SreachVo.class);

			if (vo != null) {
				list.add(vo);
			}
		}

		return list;
	}

	/**
	 * Resolves {@link SreachVo#realUrl} for every entry of {@code list} using a
	 * group of worker threads, and blocks until all of them have finished.
	 *
	 * <p>Entries that are {@code null} or have no {@code url} are skipped. A
	 * failure on one entry is reported on stderr and does not stop the
	 * remaining entries from being processed.
	 *
	 * @param list    entries to complete; modified in place
	 * @param threads maximum number of worker threads, must be at least 1
	 * @throws RuntimeException         if {@code list} is {@code null}
	 * @throws IllegalArgumentException if {@code threads} is less than 1
	 * @throws Throwable                e.g. {@link InterruptedException} while
	 *                                  waiting for the workers
	 */
	public static void fill(List<SreachVo> list, int threads) throws Throwable {

		if (list == null) {
			throw new RuntimeException("参数有误");
		}

		if (threads < 1) {
			throw new IllegalArgumentException("threads must be >= 1 but was " + threads);
		}

		// Work queue sized from the input so that large lists cannot overflow
		// it (ArrayBlockingQueue requires a capacity of at least 1).
		final BlockingQueue<SreachVo> urlQueue = new ArrayBlockingQueue<SreachVo>(Math.max(1, list.size()));

		// Queue the entries; null elements are skipped because the queue
		// rejects them.
		for (SreachVo vo : list) {
			if (vo != null) {
				urlQueue.add(vo);
			}
		}

		// No point in starting more workers than there are entries.
		final int workers = Math.min(threads, urlQueue.size());

		// Counted down once by every worker when it is done.
		final CountDownLatch latch = new CountDownLatch(workers);

		// Worker body.
		Runnable run = new Runnable() {
			@Override
			public void run() {
				try {
					SreachVo vo;
					// poll() never blocks: a worker that loses the race for
					// the last element simply sees null and exits, instead of
					// hanging forever in take().
					while ((vo = urlQueue.poll()) != null) {
						resolveRealUrl(vo);
					}
				} finally {
					latch.countDown();
				}
			}
		};

		// Start the workers.
		for (int i = 0; i < workers; i++) {

			Thread thread = new Thread(run);

			thread.setName("线程 ：" + i);

			thread.start();
		}

		// The calling thread waits for all workers to finish.
		latch.await();

		System.out.println("============== 完成 ");
	}

	/**
	 * Resolves the redirect target of a single entry and stores it in
	 * {@code vo.realUrl}. Failures are reported and swallowed so that one bad
	 * link does not end the calling worker's loop.
	 *
	 * @param vo entry to complete, not {@code null}
	 */
	private static void resolveRealUrl(SreachVo vo) {

		if (vo.url == null || vo.url.equals("")) {
			return;
		}

		try {
			URL url = new URL(vo.url);

			System.out.println("=================== url = " + url);

			String realUrl = getForwardUrl(url);

			if (realUrl != null && !realUrl.equals("")) {
				vo.realUrl = realUrl;
			}
		} catch (Exception e) {
			// Best effort: leave realUrl untouched for this entry and go on.
			System.err.println("failed to resolve real url for " + vo.url);
			e.printStackTrace();
		}
	}

	/**
	 * Requests {@code url} without following redirects and returns the value
	 * of the {@code Location} response header, i.e. the address the link
	 * forwards to.
	 *
	 * @param url redirect link to resolve
	 * @return the {@code Location} header value, or {@code null} when the
	 *         response carries no such header
	 * @throws RuntimeException    if {@code url} is {@code null}
	 * @throws java.io.IOException if the request fails
	 * @author xiaoa
	 */
	private static String getForwardUrl(URL url) throws java.io.IOException {
		if (url == null) {
			throw new RuntimeException("参数有误!");
		}

		// Request the link that was passed in (not a fixed sample link) and
		// stop at the first response so the redirect target can be read.
		Connection.Response res = Jsoup.connect(url.toString())
				.timeout(60000)
				.method(Connection.Method.GET)
				.followRedirects(false)
				.execute();

		return res.header("Location");
	}

	/**
	 * Value object for one search result. Instances are created by
	 * {@code JSON.parseObject} from the {@code data-tools} attribute in
	 * {@link BaiduSearchSpider#parse(String)}.
	 *
	 * @author xiaoa
	 * @date 2017-02-24 16:38:33
	 * @version V1.0
	 */
	public static class SreachVo {

		// Title.
		String title;

		// Original link as found in the result page.
		String url;

		// Real link, filled in by fill(List, int).
		String realUrl;

		public String getTitle() {
			return title;
		}

		public void setTitle(String title) {
			this.title = title;
		}

		public String getUrl() {
			return url;
		}

		public void setUrl(String url) {
			this.url = url;
		}

		public String getRealUrl() {
			return realUrl;
		}

		public void setRealUrl(String realUrl) {
			this.realUrl = realUrl;
		}

	}

	/**
	 * Fetches and parses 64 result pages, resolves the real links with 30
	 * worker threads and writes the result as JSON to a fixed file.
	 *
	 * @param args unused
	 * @throws Throwable if fetching, parsing, resolving or writing fails
	 */
	public static void main(String[] args) throws Throwable {

		List<SreachVo> list = new LinkedList<SreachVo>();

		// Number of pages to crawl.
		for (int i = 0; i < 64; i++) {

			// Fetch the page.
			String body = fetch(i);

			// Parse it.
			List<SreachVo> iList = parse(body);

			// parse() returns null for an empty body; skip such pages
			// instead of failing with a NullPointerException.
			if (iList != null) {
				list.addAll(iList);
			}
		}

		// Resolve the real links.
		fill(list, 30);

		String json = JSON.toJSONString(list);

		FileUtils.writeStringToFile(new File("e://百度数据.json"), json, "utf-8");

	}

}
